Project Approach

The goal of this project is to predict house prices. The approach which we used begins with Exploratory Data Analysis (EDA). We then used Data visualization and plots with ggplot to study the data. We cleaned the data and preprocessed it to create a model. We used SalePrice as our main predictor and then tried to find correlated variables to see what variables would be useful in predicting house prices. We narrowed it down to the best variables and then we used a linear regression model to predict the home prices with the variables that add value to home prices. To verify our results, we cross-validated our predicted values, which allowed us to see if the results were consistent among the different models employed.

Libraries

Loading the required libraries

This code will install the missing libraries and then load them.

packages <- c("ggplot2","dplyr","plyr","tidyverse","data.table","knitr","kableExtra","caret","corrplot","randomForest","Hmisc","moments","psych","glmnet","scales","ggrepel","cowplot")

for(pkg in packages){
  if (pkg %in% rownames(installed.packages()) == FALSE)
  {install.packages(pkg)}
  if (pkg %in% rownames(.packages()) == FALSE)
  {library(pkg, character.only = TRUE)}
}

Importing data

We are using read.csv to read data into train and test variables, setting character variables to be loaded as characters instead of factors.

train <- read.csv("ames_train.csv", stringsAsFactors = FALSE)
test <- read.csv("ames_test.csv", stringsAsFactors = FALSE)

dim(train)
## [1] 1460   81
dim(test)
## [1] 1459   80
glimpse(train)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,…
## $ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R…
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, …
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612…
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", …
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", …
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", …
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu…
## $ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I…
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", …
## $ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "…
## $ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",…
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", …
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", …
## $ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi…
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,…
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,…
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19…
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19…
## $ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G…
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "…
## $ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "…
## $ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "…
## $ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",…
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, …
## $ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "…
## $ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T…
## $ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T…
## $ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N…
## $ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", …
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99…
## $ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", …
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17…
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10…
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", …
## $ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E…
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S…
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, …
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,…
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10…
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,…
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,…
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,…
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,…
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
## $ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T…
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6…
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", …
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,…
## $ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", …
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch…
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19…
## $ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", …
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,…
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7…
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G…
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T…
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160…
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,…
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, …
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, …
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,…
## $ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, …
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,…
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10…
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20…
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W…
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm…
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, …

Combining datasets

Adding SalePrice column in the test dataset to make the variables equal in both train and test datasets.

test$SalePrice <- NA

complete_data_set <- rbind(train, test) #Binding train & test data

complete_data_set <- cbind(complete_data_set, Set = c(rep("Train", times = dim(train)[1]),
                                rep("Test", times = dim(test)[1]))) 


dim(complete_data_set)
## [1] 2919   82

Checking the summary of all the variables like mean, median and sd

describe(complete_data_set, fast=TRUE)
##               vars    n      mean       sd   min    max  range      se
## Id               1 2919   1460.00   842.79     1   2919   2918   15.60
## MSSubClass       2 2919     57.14    42.52    20    190    170    0.79
## MSZoning         3 2915       NaN       NA   Inf   -Inf   -Inf      NA
## LotFrontage      4 2433     69.31    23.34    21    313    292    0.47
## LotArea          5 2919  10168.11  7887.00  1300 215245 213945  145.98
## Street           6 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Alley            7  198       NaN       NA   Inf   -Inf   -Inf      NA
## LotShape         8 2919       NaN       NA   Inf   -Inf   -Inf      NA
## LandContour      9 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Utilities       10 2917       NaN       NA   Inf   -Inf   -Inf      NA
## LotConfig       11 2919       NaN       NA   Inf   -Inf   -Inf      NA
## LandSlope       12 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Neighborhood    13 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Condition1      14 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Condition2      15 2919       NaN       NA   Inf   -Inf   -Inf      NA
## BldgType        16 2919       NaN       NA   Inf   -Inf   -Inf      NA
## HouseStyle      17 2919       NaN       NA   Inf   -Inf   -Inf      NA
## OverallQual     18 2919      6.09     1.41     1     10      9    0.03
## OverallCond     19 2919      5.56     1.11     1      9      8    0.02
## YearBuilt       20 2919   1971.31    30.29  1872   2010    138    0.56
## YearRemodAdd    21 2919   1984.26    20.89  1950   2010     60    0.39
## RoofStyle       22 2919       NaN       NA   Inf   -Inf   -Inf      NA
## RoofMatl        23 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Exterior1st     24 2918       NaN       NA   Inf   -Inf   -Inf      NA
## Exterior2nd     25 2918       NaN       NA   Inf   -Inf   -Inf      NA
## MasVnrType      26 2895       NaN       NA   Inf   -Inf   -Inf      NA
## MasVnrArea      27 2896    102.20   179.33     0   1600   1600    3.33
## ExterQual       28 2919       NaN       NA   Inf   -Inf   -Inf      NA
## ExterCond       29 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Foundation      30 2919       NaN       NA   Inf   -Inf   -Inf      NA
## BsmtQual        31 2838       NaN       NA   Inf   -Inf   -Inf      NA
## BsmtCond        32 2837       NaN       NA   Inf   -Inf   -Inf      NA
## BsmtExposure    33 2837       NaN       NA   Inf   -Inf   -Inf      NA
## BsmtFinType1    34 2840       NaN       NA   Inf   -Inf   -Inf      NA
## BsmtFinSF1      35 2918    441.42   455.61     0   5644   5644    8.43
## BsmtFinType2    36 2839       NaN       NA   Inf   -Inf   -Inf      NA
## BsmtFinSF2      37 2918     49.58   169.21     0   1526   1526    3.13
## BsmtUnfSF       38 2918    560.77   439.54     0   2336   2336    8.14
## TotalBsmtSF     39 2918   1051.78   440.77     0   6110   6110    8.16
## Heating         40 2919       NaN       NA   Inf   -Inf   -Inf      NA
## HeatingQC       41 2919       NaN       NA   Inf   -Inf   -Inf      NA
## CentralAir      42 2919       NaN       NA   Inf   -Inf   -Inf      NA
## Electrical      43 2918       NaN       NA   Inf   -Inf   -Inf      NA
## X1stFlrSF       44 2919   1159.58   392.36   334   5095   4761    7.26
## X2ndFlrSF       45 2919    336.48   428.70     0   2065   2065    7.93
## LowQualFinSF    46 2919      4.69    46.40     0   1064   1064    0.86
## GrLivArea       47 2919   1500.76   506.05   334   5642   5308    9.37
## BsmtFullBath    48 2917      0.43     0.52     0      3      3    0.01
## BsmtHalfBath    49 2917      0.06     0.25     0      2      2    0.00
## FullBath        50 2919      1.57     0.55     0      4      4    0.01
## HalfBath        51 2919      0.38     0.50     0      2      2    0.01
## BedroomAbvGr    52 2919      2.86     0.82     0      8      8    0.02
## KitchenAbvGr    53 2919      1.04     0.21     0      3      3    0.00
## KitchenQual     54 2918       NaN       NA   Inf   -Inf   -Inf      NA
## TotRmsAbvGrd    55 2919      6.45     1.57     2     15     13    0.03
## Functional      56 2917       NaN       NA   Inf   -Inf   -Inf      NA
## Fireplaces      57 2919      0.60     0.65     0      4      4    0.01
## FireplaceQu     58 1499       NaN       NA   Inf   -Inf   -Inf      NA
## GarageType      59 2762       NaN       NA   Inf   -Inf   -Inf      NA
## GarageYrBlt     60 2760   1978.11    25.57  1895   2207    312    0.49
## GarageFinish    61 2760       NaN       NA   Inf   -Inf   -Inf      NA
## GarageCars      62 2918      1.77     0.76     0      5      5    0.01
## GarageArea      63 2918    472.87   215.39     0   1488   1488    3.99
## GarageQual      64 2760       NaN       NA   Inf   -Inf   -Inf      NA
## GarageCond      65 2760       NaN       NA   Inf   -Inf   -Inf      NA
## PavedDrive      66 2919       NaN       NA   Inf   -Inf   -Inf      NA
## WoodDeckSF      67 2919     93.71   126.53     0   1424   1424    2.34
## OpenPorchSF     68 2919     47.49    67.58     0    742    742    1.25
## EnclosedPorch   69 2919     23.10    64.24     0   1012   1012    1.19
## X3SsnPorch      70 2919      2.60    25.19     0    508    508    0.47
## ScreenPorch     71 2919     16.06    56.18     0    576    576    1.04
## PoolArea        72 2919      2.25    35.66     0    800    800    0.66
## PoolQC          73   10       NaN       NA   Inf   -Inf   -Inf      NA
## Fence           74  571       NaN       NA   Inf   -Inf   -Inf      NA
## MiscFeature     75  105       NaN       NA   Inf   -Inf   -Inf      NA
## MiscVal         76 2919     50.83   567.40     0  17000  17000   10.50
## MoSold          77 2919      6.21     2.71     1     12     11    0.05
## YrSold          78 2919   2007.79     1.31  2006   2010      4    0.02
## SaleType        79 2918       NaN       NA   Inf   -Inf   -Inf      NA
## SaleCondition   80 2919       NaN       NA   Inf   -Inf   -Inf      NA
## SalePrice       81 1460 180921.20 79442.50 34900 755000 720100 2079.11
## Set             82 2919       NaN       NA   Inf   -Inf   -Inf      NA

Data Exploration and cleaning

#To check the class (character, numeric, integer) of the variables in the data set

table(sapply(complete_data_set, class)) 
## 
## character   integer 
##        44        38

The dataset consists of 43 characters and 37 integer variables, and 35 columns with missing values (NA values).

Response variable - SalePrice

# Building a dataset with response variable SalePrice and its log transformation

data <- complete_data_set %>%
          select("SalePrice") %>%
          na.omit %>%
          mutate(LogSalePrice = log(SalePrice+1))

summary(data$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
cat("Skewness: ",skewness(data$SalePrice),"\n")
## Skewness:  1.880941
cat("Kurtosis: ",kurtosis(data$SalePrice))
## Kurtosis:  9.509812

A positive skew indicates that the tail is on the right side of the distribution, which extends towards more positive values and a kurtosis greater than 3, it is said to be leptokurtic, which means it tends to produce more outliers than the normal distribution. To correct this we have used the log transformation of the response variable.

Histogram plots of SalePrice

#Histogram plots of SalePrice and its log transformation

hist1 <- ggplot(data, aes(x = SalePrice, fill = ..count..), add=TRUE) +
  geom_histogram(binwidth = 5000) +
  ggtitle("Plot 1: Histogram plot of SalePrice") +
  ylab("Frequency (Number of houses)") +
  xlab("SalePrice") + 
  theme(plot.title = element_text(hjust = 0.5))

hist2 <- ggplot(data, aes(x = LogSalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 0.05) +
  ggtitle("Plot 2: Histogram plot of log(SalePrice)") +
  ylab("Frequency (Number of houses)") +
  xlab("log(SalePrice)") + 
  theme(plot.title = element_text(hjust = 0.5))

cowplot::plot_grid(hist1, hist2)

Histogram plot to the left is positively skewed. Plot to the right represents the log transformation of the same response variable which follows normal distribution.

Predictor Variables

Checking the count of NA values

variables_with_na <- which(colSums(is.na(complete_data_set)) > 0)

sort(colSums(sapply(complete_data_set[variables_with_na], is.na)), decreasing = TRUE)
##       PoolQC  MiscFeature        Alley        Fence    SalePrice  FireplaceQu 
##         2909         2814         2721         2348         1459         1420 
##  LotFrontage  GarageYrBlt GarageFinish   GarageQual   GarageCond   GarageType 
##          486          159          159          159          159          157 
##     BsmtCond BsmtExposure     BsmtQual BsmtFinType2 BsmtFinType1   MasVnrType 
##           82           82           81           80           79           24 
##   MasVnrArea     MSZoning    Utilities BsmtFullBath BsmtHalfBath   Functional 
##           23            4            2            2            2            2 
##  Exterior1st  Exterior2nd   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF  TotalBsmtSF 
##            1            1            1            1            1            1 
##   Electrical  KitchenQual   GarageCars   GarageArea     SaleType 
##            1            1            1            1            1

We studied the NA values of all the variables above and found some discrepancies and some missing values that we corrected.

Interpreting the NA values

#Special Case: Basement specific variables, there is a mismatch in NA values in the below variables.

complete_data_set[ !is.na(complete_data_set$BsmtFinType1) & (   
                    is.na(complete_data_set$BsmtCond) | 
                      is.na(complete_data_set$BsmtQual) | 
                      is.na(complete_data_set$BsmtExposure) | 
                      is.na(complete_data_set$BsmtFinType2)), 
                   c('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')
                  ]
##      BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
## 333        Gd       TA           No          GLQ         <NA>
## 949        Gd       TA         <NA>          Unf          Unf
## 1488       Gd       TA         <NA>          Unf          Unf
## 2041       Gd     <NA>           Mn          GLQ          Rec
## 2186       TA     <NA>           No          BLQ          Unf
## 2218     <NA>       Fa           No          Unf          Unf
## 2219     <NA>       TA           No          Unf          Unf
## 2349       Gd       TA         <NA>          Unf          Unf
## 2525       TA     <NA>           Av          ALQ          Unf
#Special Case: Garage specific variables, there is a mismatch in NA values, will check the entries

complete_data_set[ !is.na(complete_data_set$GarageType) & is.na(complete_data_set$GarageFinish), 
                   c('GarageCars', 'GarageArea', 'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish')
                 ]
##      GarageCars GarageArea GarageType GarageCond GarageQual GarageFinish
## 2127          1        360     Detchd       <NA>       <NA>         <NA>
## 2577         NA         NA     Detchd       <NA>       <NA>         <NA>

Function to replace NA values

clean_na_func <- function(data_set_name) {

  #Basement special cases, missing values
  data_set_name$BsmtFinType2[333] <- names(which.max(table(data_set_name$BsmtFinType2)))
  data_set_name$BsmtExposure[c(949, 1488, 2349)] <- names(which.max(table(data_set_name$BsmtExposure)))
  data_set_name$BsmtCond[c(2041, 2186, 2525)] <- names(which.max(table(data_set_name$BsmtCond)))
  data_set_name$BsmtQual[c(2218, 2219)] <- names(which.max(table(data_set_name$BsmtQual)))
  
  #Garage special case, missing value
  data_set_name$GarageFinish[2127] <- names(which.max(table(data_set_name$GarageFinish)))
  data_set_name$GarageCond[2127] <- names(which.max(table(data_set_name$GarageCond)))
  data_set_name$GarageQual[2127] <- names(which.max(table(data_set_name$GarageQual)))

  data_set_name$GarageCars[2577] <- 0
  data_set_name$GarageArea[2577] <- 0
  data_set_name$GarageType[2577] <- "None"
  
  #correcting the error
  data_set_name$GarageYrBlt[2593] <- 2007 #GarageYrBlt=2207 looks like a typo, YearBuilt=2006, YearRemodAdd=2007.
  

  # For below variables where an absence of a feature is NA value, replacing it with 'None'

  ColVector <- c("Alley", "BsmtQual", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "PoolQC", "Fence",
                 "MiscFeature", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "BsmtCond")
  
  data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2, 
                    function(x) {
                            replace(x, is.na(x), "None")
                            }
                    )

  
# Missing value is replaced by 0 in below Numeric Variables
  
  ColVector <- c( "MasVnrArea", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath")
  
  data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2, 
                    function(x) {
                            replace(x, is.na(x), 0)
                            }
                    )

# Replacing with the maximum value (mode values) in below variables

  ColVector <- c("MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "MasVnrType", "Electrical", "KitchenQual", 
                 "Functional", "SaleType")
  
  data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2, 
                    function(x) {
                            replace(x, is.na(x), names(which.max(table(x))))
                            }
                    )

# Replacing the value with the median

  ColVector <- c("LotFrontage","GarageCars", "GarageArea", "BsmtFinSF1")

  data_set_name[,ColVector] <- apply(data_set_name[,ColVector], 2, 
                    function(x) {
                            replace(x, is.na(x), median(x, na.rm = T))
                            }
                    )

# Assuming that the Garage was built the same year when the house was built to replace the NA values

  data_set_name$GarageYrBlt[is.na(complete_data_set$GarageYrBlt)] <- data_set_name$YearBuilt[is.na(complete_data_set$GarageYrBlt)]
  
return(data_set_name) 
}

complete_data_set <- clean_na_func(complete_data_set)

MSZoning Variable

Boxplot of SalePrice by MSZoning

ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=factor(MSZoning), y=SalePrice)) +
 geom_boxplot(outlier.colour = "red", outlier.shape = 1)  +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)  + 
ggtitle("Plot 3: Box plot of SalePrice by MSZoning")

Relationship with GrLivArea

complete_data_set %>%
  select(MSZoning, GrLivArea) %>%
  group_by(MSZoning) %>%
  summarise_at(vars(GrLivArea), list(count = length,avg_houseSize = mean))
## # A tibble: 5 × 3
##   MSZoning count avg_houseSize
##   <chr>    <int>         <dbl>
## 1 C (all)     25         1134.
## 2 FV         139         1582.
## 3 RH          26         1478.
## 4 RL        2269         1536.
## 5 RM         460         1322.

Average size of a house in commercial is much smaller than that in village zone, so there is no data discrepancy in this predictor variable and can be used in the model.

Converting Quality variables to integer variables

convert_to_int_func <- function(data_set_name) {
  
  
  #There are various columns which has below scores, we would convert then to integer values based on the quality.
  
  score <- c('None'=0, 'Po'=1, 'Fa'=2, 'TA'=3, 'Gd'=4, 'Ex'=5)

  data_set_name$PoolQC <- as.integer(revalue(data_set_name$PoolQC, score))
  data_set_name$FireplaceQu <- as.integer(revalue(data_set_name$FireplaceQu, score))
  data_set_name$KitchenQual <- as.integer(revalue(data_set_name$KitchenQual, score))
  data_set_name$ExterQual <- as.integer(revalue(data_set_name$ExterQual, score))
  data_set_name$ExterCond <- as.integer(revalue(data_set_name$ExterCond, score))
  data_set_name$BsmtQual <- as.integer(revalue(data_set_name$BsmtQual, score))
  data_set_name$BsmtCond <- as.integer(revalue(data_set_name$BsmtCond, score))
  data_set_name$GarageQual <- as.integer(revalue(data_set_name$GarageQual, score))
  data_set_name$GarageCond <- as.integer(revalue(data_set_name$GarageCond, score))


  data_set_name$BsmtExposure <- as.integer(revalue(data_set_name$BsmtExposure, 
                                                 c('None'=0, 'No'=1, 'Mn'=2, 'Av'=3, 'Gd'=4)))

  data_set_name$BsmtFinType1 <- as.integer(revalue(data_set_name$BsmtFinType1, 
                                                 c('None'=0, 'Unf'=1, 'LwQ'=2, 'Rec'=3, 'BLQ'=4, 'ALQ'=5, 'GLQ'=6)))
  
  data_set_name$BsmtFinType2 <- as.integer(revalue(data_set_name$BsmtFinType2, 
                                                 c('None'=0, 'Unf'=1, 'LwQ'=2, 'Rec'=3, 'BLQ'=4, 'ALQ'=5, 'GLQ'=6)))

  data_set_name$LotShape <- as.integer(revalue(data_set_name$LotShape, 
                                             c('IR3'=0, 'IR2'=1, 'IR1'=2, 'Reg'=3)))

  data_set_name$GarageFinish<- as.integer(revalue(data_set_name$GarageFinish, 
                                                 c('None'=0, 'Unf'=1, 'RFn'=2, 'Fin'=3)))
  
  data_set_name$MasVnrType <- as.integer(revalue(data_set_name$MasVnrType, 
                                               c('None'=0, 'BrkCmn'=0, 'BrkFace'=1, 'Stone'=2)))
  
  data_set_name$Functional <- as.integer(revalue(data_set_name$Functional, 
                                                 c('Sal'=0, 'Sev'=1, 'Maj2'=2, 'Maj1'=3, 'Mod'=4, 'Min2'=5, 'Min1'=6, 'Typ'=7)))
  

return(data_set_name)
}

complete_data_set <- convert_to_int_func(complete_data_set)

table(sapply(complete_data_set, class))
## 
## character   integer   numeric 
##        28        43        11

Feature Engineering

Correlation Plot

numeric_train_data <- complete_data_set %>% filter(Set == "Train") %>% 
                                select(which(sapply(.,is.integer)), which(sapply(., is.numeric)))
#, use = "pairwise.complete.obs"
correlation <- cor( numeric_train_data ) %>%
                as.matrix( sort( correlation[,'SalePrice'], decreasing = TRUE )) #sort on decreasing correlations with SalePrice


corrplot.mixed(correlation, tl.col="black", tl.pos = "lt")

> From the correlation plot, we found a list of variables that are highly correlated with each other so we can remove one of them.

Excluding some variables based on correlation plot

column_drop_list <- c('Utilities','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','YearRemodAdd','YrSold',
                      'EnclosedPorch','X3SsnPorch','ScreenPorch','WoodDeckSF','GarageArea','GarageQual','GarageCond',
                      'ExterQual','ExterCond','KitchenQual','FireplaceQu','PoolArea','PoolQC'
                      )

complete_data_set <- complete_data_set[,!(names(complete_data_set) %in% column_drop_list)]

##Converting character variables to factors

table(sapply(complete_data_set, class))
## 
## character   integer   numeric 
##        27        35        13
# List of character variables
variable_classes <- sapply(complete_data_set, class)
names(variable_classes[which(variable_classes=="character")])
##  [1] "MSZoning"      "Street"        "Alley"         "LandContour"  
##  [5] "LotConfig"     "LandSlope"     "Neighborhood"  "Condition1"   
##  [9] "Condition2"    "BldgType"      "HouseStyle"    "RoofStyle"    
## [13] "RoofMatl"      "Exterior1st"   "Exterior2nd"   "Foundation"   
## [17] "Heating"       "HeatingQC"     "CentralAir"    "Electrical"   
## [21] "GarageType"    "PavedDrive"    "Fence"         "MiscFeature"  
## [25] "SaleType"      "SaleCondition" "Set"
convert_to_factor_func <- function(data_set_name) {
  
#Numeric factors that should be categorical
  data_set_name$MoSold <- factor(data_set_name$MoSold)
  data_set_name$MSSubClass <- factor(data_set_name$MSSubClass)
  
  data_set_name <- data_set_name %>% 
  mutate(Neighborhood = as.factor(Neighborhood),
         Neighborhood2 = fct_collapse(Neighborhood,
                                           "BluesteSawyer" = c("Blueste", "Sawyer"),
                                           "NPkVillMitchel" = c("NPkVill", "Mitchel")))
  
  data_set_name <- data_set_name %>% 
  mutate(LotShape = as.factor(LotShape),
         LotShape2 = fct_collapse(LotShape,
                                       "IR23" = c("IR2", "IR3")))

  data_set_name <- data_set_name %>% 
  mutate(LotConfig = as.factor(LotConfig),
         LotConfig2 = if_else(LotConfig == "CulDSac", 1, 0)) 


#Finding the Variables with character class
  variable_classes <- sapply(data_set_name, class)
  character_variables <- names(variable_classes[which(variable_classes=="character")])

# Converting them to factor
  data_set_name[character_variables] <- lapply(data_set_name[character_variables], factor)

  return(data_set_name)
}

complete_data_set <- convert_to_factor_func(complete_data_set)

Checking for outliers

Scatterplot to identify the outliers

#GrLivArea

ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=GrLivArea, y=SalePrice))+
        geom_point() + geom_smooth(method = "lm", se=FALSE, aes(group=1)) +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
        geom_text_repel(aes(label = ifelse(complete_data_set$GrLivArea[!is.na(complete_data_set$SalePrice)]>4500, rownames(complete_data_set), '')))

#TotalBsmtSF

ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=TotalBsmtSF, y=SalePrice))+
        geom_point() + geom_smooth(method = "lm", se=FALSE, aes(group=1)) +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
        geom_text_repel(aes(label = ifelse(complete_data_set$TotalBsmtSF[!is.na(complete_data_set$SalePrice)]>4500, rownames(complete_data_set), '')))

Removing outliers

complete_data_set <- complete_data_set[ !(complete_data_set$Id %in% c(524, 1299)), ]

complete_data_set %>% 
  filter(OverallCond == 2 & SalePrice > 12)
##    Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  89         50  C (all)         105    8470   Pave  None        2         Lvl
## 2 251         30       RL          55    5350   Pave  None        2         Lvl
## 3 379         20       RL          88   11394   Pave  None        3         Lvl
## 4 399         30       RM          60    8967   Pave  None        3         Lvl
## 5 677         70       RM          60    9600   Pave  Grvl        3         Lvl
##   LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle
## 1    Corner       Gtl       IDOTRR      Feedr      Feedr     1Fam     1.5Fin
## 2    Inside       Gtl      BrkSide       Norm       Norm     1Fam     1Story
## 3    Corner       Gtl      StoneBr       Norm       Norm     1Fam     1Story
## 4    Corner       Gtl       IDOTRR       Norm       Norm     1Fam     1Story
## 5    Inside       Gtl      OldTown       Norm       Norm     1Fam     2Story
##   OverallQual OverallCond YearBuilt RoofStyle RoofMatl Exterior1st Exterior2nd
## 1           3           2      1915       Hip  CompShg     Plywood     Plywood
## 2           3           2      1940     Gable  CompShg     Wd Sdng     Plywood
## 3           9           2      2010       Hip  CompShg     VinylSd     VinylSd
## 4           5           2      1920     Gable  CompShg     MetalSd     MetalSd
## 5           4           2      1900     Gable  CompShg     AsbShng      Stucco
##   MasVnrType MasVnrArea Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 1          0          0     CBlock        3        2            1            1
## 2          0          0     CBlock        3        3            1            1
## 3          2        350      PConc        5        3            3            6
## 4          0          0     BrkTil        2        1            1            1
## 5          0          0     BrkTil        3        2            1            1
##   BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 1          0            1          0      1013        1013    GasA        TA
## 2          0            1          0       728         728    GasA        Ex
## 3       1445            1          0       411        1856    GasA        Ex
## 4          0            1          0       961         961    GasA        Gd
## 5          0            1          0      1095        1095    GasW        Fa
##   CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BedroomAbvGr
## 1          N      SBrkr      1013         0          513      1526            2
## 2          Y      SBrkr      1306         0            0      1306            3
## 3          Y      SBrkr      1856         0            0      1856            1
## 4          Y        Mix      1077         0            0      1077            2
## 5          N      SBrkr      1095       679            0      1774            4
##   KitchenAbvGr TotRmsAbvGrd Functional Fireplaces GarageType GarageYrBlt
## 1            1            6          7          0       None        1915
## 2            1            6          4          0       None        1940
## 3            1            8          7          1     Attchd        2010
## 4            1            6          2          0     Detchd        1920
## 5            2            8          5          0     2Types        1920
##   GarageFinish GarageCars PavedDrive OpenPorchSF Fence MiscFeature MiscVal
## 1            0          0          N           0 MnPrv        None       0
## 2            0          0          Y           0  GdWo        Shed     450
## 3            3          3          Y           0  None        None       0
## 4            1          1          N           0  None        None       0
## 5            1          3          N           0  None        None       0
##   MoSold SaleType SaleCondition SalePrice   Set TotalBaths RemodelAge
## 1     10    ConLD       Abnorml     85000 Train        1.0         27
## 2      5       WD        Normal     76500 Train        1.0         44
## 3      6      New       Partial    394432 Train        2.5          0
## 4     11       WD       Abnorml     67000 Train        1.0         57
## 5      5       WD        Normal     87000 Train        3.0         56
##   RemodelFlag TotalAreaSF TotalSF TotalPorchSF OverallScore GarageScore
## 1           1        2539    2539          156            6           0
## 2           1        2034    2034          263            6           0
## 3           0        3712    3712          113           18           9
## 4           1        2038    2038            0           10           1
## 5           1        2869    2869           90            8           4
##   ExterScore KitchenScore FireplaceScore GarageAreaScore PoolScore
## 1          4            2              0               0         0
## 2          3            2              0               0         0
## 3         12            5              5            2502         0
## 4          6            3              0             338         0
## 5          9            6              0            1558         0
##   Neighborhood2 LotShape2 LotConfig2
## 1        IDOTRR         2          0
## 2       BrkSide         2          0
## 3       StoneBr         3          0
## 4        IDOTRR         3          0
## 5       OldTown         3          0
complete_data_set <- complete_data_set %>% 
  mutate(OverallCond = if_else(Id == 379, 5L, OverallCond))

Id column is removed from both the datasets, Id is not a predictor.

Rechecking the Correlation values after cleaning the data

train_cleaned <- complete_data_set %>%
                   filter(Set == "Train")

test_cleaned <- complete_data_set %>%
                   filter(Set == "Test") 

#high correlation cor>0.3
for(col in colnames(train_cleaned)){
  if(is.numeric(train_cleaned[,col])){
    if(abs(cor(train_cleaned[,col],train_cleaned$SalePrice)) > 0.5 ){
      print (c(col, abs(cor(train_cleaned[,col],train_cleaned$SalePrice))))
    }
  }
}
## [1] "OverallQual"       "0.795774267424943"
## [1] "YearBuilt"         "0.523608400701364"
## [1] "BsmtQual"          "0.586674006281695"
## [1] "TotalBsmtSF"       "0.651152911047542"
## [1] "X1stFlrSF"         "0.631530379198999"
## [1] "GrLivArea"         "0.734968164535933"
## [1] "TotRmsAbvGrd"    "0.5377688162416"
## [1] "GarageYrBlt"       "0.508718778483912"
## [1] "GarageFinish"      "0.550254601749493"
## [1] "GarageCars"        "0.641047007641277"
## [1] "SalePrice" "1"        
## [1] "TotalBaths"        "0.635896297561239"
## [1] "RemodelAge"        "0.509705795839106"
## [1] "TotalAreaSF"       "0.829041978106551"
## [1] "TotalSF"           "0.829041978106551"
## [1] "OverallScore"      "0.566758909826381"
## [1] "ExterScore"        "0.578456395158882"
## [1] "FireplaceScore"    "0.514058085218827"
## [1] "GarageAreaScore"   "0.630015330054997"
# low correaltion cor<0.1
for(col in colnames(train_cleaned)){
  if(is.numeric(train_cleaned[,col])){
    if(abs(cor(train_cleaned[,col],train_cleaned$SalePrice))<0.1){
      print(c(col,cor(train_cleaned[,col],train_cleaned$SalePrice)))
    }
  }
}
## [1] "Id"                  "-0.0216728932732659"
## [1] "OverallCond"         "-0.0732347145215433"
## [1] "BsmtFinType2"         "-0.00437071368987331"
## [1] "BsmtFinSF2"          "-0.0114216653872752"
## [1] "LowQualFinSF"        "-0.0256247637384778"
## [1] "MiscVal"             "-0.0212031568730371"

Plots to study the important variables to be used in the model

#OverallQual

ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=factor(OverallQual), y=SalePrice)) +
 geom_boxplot(outlier.colour = "red", outlier.shape = 1)  +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) 

#YearBuilt

ggplot(data=complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=YearBuilt^15, y=SalePrice))+
        geom_point() + geom_smooth(method = "lm", se=FALSE, aes(group=1)) +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
        geom_text_repel(aes(label = ifelse(complete_data_set$YearBuilt[!is.na(complete_data_set$SalePrice)]>4500, rownames(complete_data_set), '')))

ggplot(complete_data_set[!is.na(complete_data_set$SalePrice),], aes(x=Neighborhood, y=SalePrice)) +
        geom_bar(stat='summary', fun.y = "median", fill='blue') +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        scale_y_continuous(breaks= seq(0, 800000, by=50000), labels = comma) +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3) +
        geom_hline(yintercept=163000, linetype="dashed", color = "red") #dashed line is median SalePrice

hist.data.frame(complete_data_set %>%
  select("GrLivArea","RemodelAge"))

hist.data.frame(complete_data_set %>%
  select("GarageFinish","GarageCars","GarageAreaScore"))

hist.data.frame(complete_data_set %>%
  select("OverallScore","ExterScore","FireplaceScore","KitchenScore"))

hist.data.frame(complete_data_set %>%
  select("HeatingQC","CentralAir","FireplaceScore"))

hist.data.frame(complete_data_set %>%
  select("BsmtQual","TotalBsmtSF","BsmtFinSF1","TotalPorchSF","KitchenAbvGr"))

hist.data.frame(complete_data_set %>%
  select("X1stFlrSF","TotRmsAbvGrd","TotalBaths","TotalSF","OpenPorchSF"))

Linear Regression Model

tc <- trainControl(method = "cv", number = 5)

set.seed(123)

(lm_model <- train(log(SalePrice+1) ~ (factor(OverallQual)+factor(BsmtQual)*TotalBsmtSF+YearBuilt^20+log(X1stFlrSF+1)*TotRmsAbvGrd
                                        +factor(Neighborhood)+GarageFinish^3+GarageCars*TotalBaths+RemodelAge+
                                          TotalSF^2+OverallScore+ExterScore*MSZoning+
                                          factor(FireplaceScore)+Fireplaces+GarageAreaScore*log(GrLivArea+1)^14+
                                          HeatingQC^4+CentralAir^2+TotalPorchSF+BsmtFinSF1*MasVnrArea^2
                                        +KitchenScore*KitchenAbvGr^2+OverallQual+OpenPorchSF+TotalAreaSF),
                     data = select(train_cleaned, -Id),
                     preProcess = c("center","scale"),
                     trControl = tc,
                     method = "lm"))
## Linear Regression 
## 
## 1458 samples
##   28 predictor
## 
## Pre-processing: centered (92), scaled (92) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1167, 1167, 1165, 1166, 1167 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.1178501  0.9139376  0.0839691
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lm_model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72639 -0.05050  0.00508  0.05927  0.39042 
## 
## Coefficients: (3 not defined because of singularities)
##                                        Estimate Std. Error  t value Pr(>|t|)
## (Intercept)                          12.0240152  0.0029271 4107.856  < 2e-16
## `factor(OverallQual)2`                0.0033719  0.0048661    0.693 0.488466
## `factor(OverallQual)3`                0.0152254  0.0103134    1.476 0.140099
## `factor(OverallQual)4`                0.0496164  0.0233832    2.122 0.034027
## `factor(OverallQual)5`                0.0732820  0.0389946    1.879 0.060418
## `factor(OverallQual)6`                0.0631568  0.0388689    1.625 0.104421
## `factor(OverallQual)7`                0.0629336  0.0374449    1.681 0.093050
## `factor(OverallQual)8`                0.0536447  0.0294149    1.824 0.068411
## `factor(OverallQual)9`                0.0391766  0.0162109    2.417 0.015793
## `factor(OverallQual)10`               0.0202401  0.0105148    1.925 0.054446
## `factor(BsmtQual)2`                   0.0209218  0.0112978    1.852 0.064263
## `factor(BsmtQual)3`                   0.0015115  0.0157188    0.096 0.923407
## `factor(BsmtQual)4`                  -0.0062475  0.0152052   -0.411 0.681227
## `factor(BsmtQual)5`                  -0.0110794  0.0145039   -0.764 0.445062
## TotalBsmtSF                          -0.0174799  0.0220039   -0.794 0.427101
## YearBuilt                             0.0687105  0.0085293    8.056 1.70e-15
## `log(X1stFlrSF + 1)`                  0.0523072  0.0189461    2.761 0.005842
## TotRmsAbvGrd                          0.2687182  0.0963312    2.790 0.005352
## `factor(Neighborhood)Blueste`        -0.0006722  0.0032337   -0.208 0.835370
## `factor(Neighborhood)BrDale`         -0.0061193  0.0050081   -1.222 0.221964
## `factor(Neighborhood)BrkSide`         0.0151974  0.0074044    2.052 0.040312
## `factor(Neighborhood)ClearCr`         0.0143696  0.0051830    2.772 0.005639
## `factor(Neighborhood)CollgCr`         0.0176097  0.0091815    1.918 0.055323
## `factor(Neighborhood)Crawfor`         0.0272218  0.0065665    4.146 3.60e-05
## `factor(Neighborhood)Edwards`        -0.0049775  0.0084198   -0.591 0.554510
## `factor(Neighborhood)Gilbert`         0.0109144  0.0071586    1.525 0.127575
## `factor(Neighborhood)IDOTRR`          0.0082739  0.0069885    1.184 0.236644
## `factor(Neighborhood)MeadowV`        -0.0108154  0.0049596   -2.181 0.029375
## `factor(Neighborhood)Mitchel`         0.0009986  0.0061792    0.162 0.871639
## `factor(Neighborhood)NAmes`           0.0072094  0.0116658    0.618 0.536679
## `factor(Neighborhood)NoRidge`         0.0156518  0.0059703    2.622 0.008849
## `factor(Neighborhood)NPkVill`        -0.0024076  0.0038433   -0.626 0.531137
## `factor(Neighborhood)NridgHt`         0.0193344  0.0073596    2.627 0.008707
## `factor(Neighborhood)NWAmes`         -0.0016783  0.0071954   -0.233 0.815605
## `factor(Neighborhood)OldTown`         0.0038260  0.0103913    0.368 0.712787
## `factor(Neighborhood)Sawyer`          0.0015358  0.0073959    0.208 0.835526
## `factor(Neighborhood)SawyerW`         0.0043890  0.0064740    0.678 0.497921
## `factor(Neighborhood)Somerst`         0.0186482  0.0090395    2.063 0.039304
## `factor(Neighborhood)StoneBr`         0.0179401  0.0048405    3.706 0.000219
## `factor(Neighborhood)SWISU`           0.0010811  0.0053820    0.201 0.840824
## `factor(Neighborhood)Timber`          0.0087753  0.0054509    1.610 0.107653
## `factor(Neighborhood)Veenker`         0.0084372  0.0039164    2.154 0.031387
## GarageFinish                          0.0114660  0.0044744    2.563 0.010496
## GarageCars                            0.0356202  0.0114257    3.118 0.001862
## TotalBaths                            0.0323409  0.0089618    3.609 0.000319
## RemodelAge                           -0.0102093  0.0048319   -2.113 0.034789
## TotalSF                               0.1723086  0.0341987    5.038 5.32e-07
## OverallScore                          0.0775849  0.0059706   12.995  < 2e-16
## ExterScore                           -0.0152106  0.0349726   -0.435 0.663682
## MSZoningFV                            0.1104842  0.0435973    2.534 0.011381
## MSZoningRH                            0.0322097  0.0211992    1.519 0.128897
## MSZoningRL                            0.1612668  0.0576083    2.799 0.005192
## MSZoningRM                            0.0922414  0.0511111    1.805 0.071338
## `factor(FireplaceScore)1`             0.0043641  0.0066630    0.655 0.512598
## `factor(FireplaceScore)2`             0.0068817  0.0076451    0.900 0.368201
## `factor(FireplaceScore)3`             0.0151633  0.0199188    0.761 0.446634
## `factor(FireplaceScore)4`             0.0271374  0.0220991    1.228 0.219662
## `factor(FireplaceScore)5`             0.0072808  0.0067049    1.086 0.277719
## `factor(FireplaceScore)6`             0.0185677  0.0197627    0.940 0.347621
## `factor(FireplaceScore)8`             0.0246071  0.0196528    1.252 0.210751
## `factor(FireplaceScore)9`             0.0015631  0.0050299    0.311 0.756020
## `factor(FireplaceScore)10`            0.0011001  0.0062727    0.175 0.860806
## `factor(FireplaceScore)12`            0.0018226  0.0050184    0.363 0.716525
## `factor(FireplaceScore)15`            0.0012955  0.0050244    0.258 0.796560
## Fireplaces                           -0.0132099  0.0328540   -0.402 0.687689
## GarageAreaScore                       0.1390485  0.0793377    1.753 0.079892
## `log(GrLivArea + 1)`                  0.0208955  0.0210834    0.991 0.321816
## HeatingQCFa                          -0.0060860  0.0034395   -1.769 0.077044
## HeatingQCGd                          -0.0046495  0.0035210   -1.320 0.186889
## HeatingQCPo                          -0.0037036  0.0030324   -1.221 0.222168
## HeatingQCTA                          -0.0104059  0.0041372   -2.515 0.012009
## CentralAirY                           0.0102371  0.0038877    2.633 0.008553
## TotalPorchSF                          0.0177087  0.0037011    4.785 1.90e-06
## BsmtFinSF1                            0.0297297  0.0049065    6.059 1.77e-09
## MasVnrArea                            0.0015946  0.0052087    0.306 0.759551
## KitchenScore                          0.0026301  0.0112829    0.233 0.815712
## KitchenAbvGr                         -0.0365976  0.0143460   -2.551 0.010848
## OverallQual                                  NA         NA       NA       NA
## OpenPorchSF                           0.0004292  0.0036249    0.118 0.905773
## TotalAreaSF                                  NA         NA       NA       NA
## `factor(BsmtQual)2:TotalBsmtSF`      -0.0338750  0.0111822   -3.029 0.002497
## `factor(BsmtQual)3:TotalBsmtSF`      -0.0440706  0.0180919   -2.436 0.014980
## `factor(BsmtQual)4:TotalBsmtSF`      -0.0372368  0.0198128   -1.879 0.060397
## `factor(BsmtQual)5:TotalBsmtSF`              NA         NA       NA       NA
## `log(X1stFlrSF + 1):TotRmsAbvGrd`    -0.2814197  0.1044987   -2.693 0.007167
## `GarageCars:TotalBaths`              -0.0216757  0.0155648   -1.393 0.163965
## `ExterScore:MSZoningFV`              -0.0161061  0.0520054   -0.310 0.756836
## `ExterScore:MSZoningRH`               0.0114287  0.0219606    0.520 0.602857
## `ExterScore:MSZoningRL`               0.0186702  0.0786204    0.237 0.812325
## `ExterScore:MSZoningRM`               0.0377002  0.0618111    0.610 0.542012
## `GarageAreaScore:log(GrLivArea + 1)` -0.1246132  0.0830349   -1.501 0.133655
## `BsmtFinSF1:MasVnrArea`               0.0012162  0.0056251    0.216 0.828854
## `KitchenScore:KitchenAbvGr`           0.0175014  0.0204629    0.855 0.392550
##                                         
## (Intercept)                          ***
## `factor(OverallQual)2`                  
## `factor(OverallQual)3`                  
## `factor(OverallQual)4`               *  
## `factor(OverallQual)5`               .  
## `factor(OverallQual)6`                  
## `factor(OverallQual)7`               .  
## `factor(OverallQual)8`               .  
## `factor(OverallQual)9`               *  
## `factor(OverallQual)10`              .  
## `factor(BsmtQual)2`                  .  
## `factor(BsmtQual)3`                     
## `factor(BsmtQual)4`                     
## `factor(BsmtQual)5`                     
## TotalBsmtSF                             
## YearBuilt                            ***
## `log(X1stFlrSF + 1)`                 ** 
## TotRmsAbvGrd                         ** 
## `factor(Neighborhood)Blueste`           
## `factor(Neighborhood)BrDale`            
## `factor(Neighborhood)BrkSide`        *  
## `factor(Neighborhood)ClearCr`        ** 
## `factor(Neighborhood)CollgCr`        .  
## `factor(Neighborhood)Crawfor`        ***
## `factor(Neighborhood)Edwards`           
## `factor(Neighborhood)Gilbert`           
## `factor(Neighborhood)IDOTRR`            
## `factor(Neighborhood)MeadowV`        *  
## `factor(Neighborhood)Mitchel`           
## `factor(Neighborhood)NAmes`             
## `factor(Neighborhood)NoRidge`        ** 
## `factor(Neighborhood)NPkVill`           
## `factor(Neighborhood)NridgHt`        ** 
## `factor(Neighborhood)NWAmes`            
## `factor(Neighborhood)OldTown`           
## `factor(Neighborhood)Sawyer`            
## `factor(Neighborhood)SawyerW`           
## `factor(Neighborhood)Somerst`        *  
## `factor(Neighborhood)StoneBr`        ***
## `factor(Neighborhood)SWISU`             
## `factor(Neighborhood)Timber`            
## `factor(Neighborhood)Veenker`        *  
## GarageFinish                         *  
## GarageCars                           ** 
## TotalBaths                           ***
## RemodelAge                           *  
## TotalSF                              ***
## OverallScore                         ***
## ExterScore                              
## MSZoningFV                           *  
## MSZoningRH                              
## MSZoningRL                           ** 
## MSZoningRM                           .  
## `factor(FireplaceScore)1`               
## `factor(FireplaceScore)2`               
## `factor(FireplaceScore)3`               
## `factor(FireplaceScore)4`               
## `factor(FireplaceScore)5`               
## `factor(FireplaceScore)6`               
## `factor(FireplaceScore)8`               
## `factor(FireplaceScore)9`               
## `factor(FireplaceScore)10`              
## `factor(FireplaceScore)12`              
## `factor(FireplaceScore)15`              
## Fireplaces                              
## GarageAreaScore                      .  
## `log(GrLivArea + 1)`                    
## HeatingQCFa                          .  
## HeatingQCGd                             
## HeatingQCPo                             
## HeatingQCTA                          *  
## CentralAirY                          ** 
## TotalPorchSF                         ***
## BsmtFinSF1                           ***
## MasVnrArea                              
## KitchenScore                            
## KitchenAbvGr                         *  
## OverallQual                             
## OpenPorchSF                             
## TotalAreaSF                             
## `factor(BsmtQual)2:TotalBsmtSF`      ** 
## `factor(BsmtQual)3:TotalBsmtSF`      *  
## `factor(BsmtQual)4:TotalBsmtSF`      .  
## `factor(BsmtQual)5:TotalBsmtSF`         
## `log(X1stFlrSF + 1):TotRmsAbvGrd`    ** 
## `GarageCars:TotalBaths`                 
## `ExterScore:MSZoningFV`                 
## `ExterScore:MSZoningRH`                 
## `ExterScore:MSZoningRL`                 
## `ExterScore:MSZoningRM`                 
## `GarageAreaScore:log(GrLivArea + 1)`    
## `BsmtFinSF1:MasVnrArea`                 
## `KitchenScore:KitchenAbvGr`             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1118 on 1368 degrees of freedom
## Multiple R-squared:  0.9266, Adjusted R-squared:  0.9218 
## F-statistic:   194 on 89 and 1368 DF,  p-value: < 2.2e-16

Predictions

predictions <- data.frame(Id = test_cleaned$Id,
                          lm = exp(predict(lm_model, newdata= test_cleaned))
                          )

head(predictions)
##     Id       lm
## 1 1461 120070.5
## 2 1462 152337.9
## 3 1463 183249.0
## 4 1464 207566.9
## 5 1465 203255.8
## 6 1466 174096.9

In-sample RMSE value

rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))

rmse(train_cleaned$SalePrice, exp(predict(lm_model, newdata= train_cleaned)) )
## [1] 20390.52

in-sample R-squared value

r2 <- function(actual, predicted){
  TSS <- sum((actual - mean(actual))^2)
  RSS <- sum((actual - predicted)^2)
  1 - RSS/TSS
}

r2(train_cleaned$SalePrice, exp(predict(lm_model, newdata= train_cleaned)))
## [1] 0.9341623

Predicted vs actual SalePrice plot

plot(exp(predict(lm_model, newdata= train_cleaned)),                                # Draw plot using Base R
     train_cleaned$SalePrice,
     xlab = "Predicted Values",
     ylab = "Observed Values")
abline(a = 0,                                        # Add straight line
       b = 1,
       col = "red",
       lwd = 2)

Submission

#kaggle score of 0.13                         
submit <- test_cleaned %>% 
  select(Id) %>% 
  mutate(SalePrice = exp(predict(object = lm_model, newdata = test_cleaned)))

write.csv(submit, "kaggle_submit.csv", row.names = F )
head(submit)
##     Id SalePrice
## 1 1461  120070.5
## 2 1462  152337.9
## 3 1463  183249.0
## 4 1464  207566.9
## 5 1465  203255.8
## 6 1466  174096.9